knitr document van Steensel lab

TRIP Clone identification

Introduction

Description of Data

How to make a good rendering table:

column1 column2 column3
1 2 3
a b c

Data processing

Path, Libraries, Parameters and Useful Functions

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
## 
##     first, second
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
## 
##     shift
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## 
## Attaching package: 'Laurae'
## The following object is masked from 'package:matrixStats':
## 
##     mean2
## The following object is masked from 'package:data.table':
## 
##     setDF
## Loading required package: magrittr

Custom functions

Functions used thoughout this script.

p.adjust <- function(p, method = p.adjust.methods, n = length(p))
{
    ## Methods 'Hommel', 'BH', 'BY' and speed improvements
    ## contributed by Gordon Smyth
    method <- match.arg(method)
    if(method == "fdr") method <- "BH"  # back compatibility
    nm <- names(p)
    p <- as.numeric(p)
    p0 <- setNames(p, nm)
    if(all(nna <- !is.na(p))) nna <- TRUE
    p <- p[nna]
    lp <- length(p)
    # stopifnot(n >= lp)
    if (n <= 1) return(p0)
    if (n == 2 && method == "hommel") method <- "hochberg"

    p0[nna] <-
    switch(method,
           bonferroni = pmin(1, n * p),
           holm = {
           i <- seq_len(lp)
           o <- order(p)
           ro <- order(o)
           pmin(1, cummax( (n - i + 1L) * p[o] ))[ro]
           },
           hommel = { ## needs n-1 >= 2 in for() below
           if(n > lp) p <- c(p, rep.int(1, n-lp))
           i <- seq_len(n)
           o <- order(p)
           p <- p[o]
           ro <- order(o)
           q <- pa <- rep.int( min(n*p/i), n)
           for (j in (n-1):2) {
               ij <- seq_len(n-j+1)
               i2 <- (n-j+2):n
               q1 <- min(j*p[i2]/(2:j))
               q[ij] <- pmin(j*p[ij], q1)
               q[i2] <- q[n-j+1]
               pa <- pmax(pa,q)
           }
           pmax(pa,p)[if(lp < n) ro[1:lp] else ro]
           },
           hochberg = {
           i <- lp:1L
           o <- order(p, decreasing = TRUE)
           ro <- order(o)
           pmin(1, cummin( (n - i + 1L) * p[o] ))[ro]
           },
           BH = {
           i <- lp:1L
           o <- order(p, decreasing = TRUE)
           ro <- order(o)
           pmin(1, cummin( n / i * p[o] ))[ro]
           },
           BY = {
           i <- lp:1L
           o <- order(p, decreasing = TRUE)
           ro <- order(o)
           q <- sum(1L/(1L:n))
           pmin(1, cummin(q * n / i * p[o]))[ro]
           },
           none = p)
    p0
}

SetFileName <- function(filename, initials) {
  # Set filename with extension and initials to make filename with date integrated.
  filename <- substitute(filename)
  initials <- substitute(initials)
  filename <- paste0(initials, Date, filename)
  filename
}

# From Fede:
# ggpairs custom functions
corColor <- function(data, mapping, color = I("black"), sizeRange = c(1, 3), ...) {

  x   <- eval_data_col(data, mapping$x)
  y   <- eval_data_col(data, mapping$y)
  r   <- cor(x, y)
  rt  <- format(r, digits = 3)
  tt  <- as.character(rt)
  cex <- max(sizeRange)

  # helper function to calculate a useable size
  percent_of_range <- function(percent, range) {
    percent * diff(range) + min(range, na.rm = TRUE)
  }

  # plot correlation coefficient
  p <- ggally_text(label = tt, mapping = aes(), xP = 0.5, yP = 0.5,
                   size = I(percent_of_range(cex * abs(r), sizeRange)), color = color, ...) +
    theme(panel.grid.minor=element_blank(),
          panel.grid.major=element_blank())

  corColors <- RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")[2:6]

  if (r <= boundaries[1]) {
    corCol <- corColors[1]
  } else if (r <= boundaries[3]) {
    corCol <- corColors[2]
  } else if (r < boundaries[3]) {
    corCol <- corColors[3]
  } else if (r < boundaries[4]) {
    corCol <- corColors[4]
  } else {
    corCol <- corColors[5]
  }

  p <- p +
    theme(panel.background = element_rect(fill = corCol))

  return(p)
}


# stat_smooth_func <- function(mapping = NULL, data = NULL,
#                         geom = "smooth", position = "identity",
#                         ...,
#                         method = "auto",
#                         formula = y ~ x,
#                         se = TRUE,
#                         n = 80,
#                         span = 0.75,
#                         fullrange = FALSE,
#                         level = 0.95,
#                         method.args = list(),
#                         na.rm = FALSE,
#                         show.legend = NA,
#                         inherit.aes = TRUE,
#                         xpos = NULL,
#                         ypos = NULL) {
#   layer(
#     data = data,
#     mapping = mapping,
#     stat = StatSmoothFunc,
#     geom = geom,
#     position = position,
#     show.legend = show.legend,
#     inherit.aes = inherit.aes,
#     params = list(
#       method = method,
#       formula = formula,
#       se = se,
#       n = n,
#       fullrange = fullrange,
#       level = level,
#       na.rm = na.rm,
#       method.args = method.args,
#       span = span,
#       xpos = xpos,
#       ypos = ypos,
#       ...
#     )
#   )
# }
# 
# StatSmoothFunc <- ggproto("StatSmooth", Stat,
#                       
#                       setup_params = function(data, params) {
#                         # Figure out what type of smoothing to do: loess for small datasets,
#                         # gam with a cubic regression basis for large data
#                         # This is based on the size of the _largest_ group.
#                         if (identical(params$method, "auto")) {
#                           max_group <- max(table(data$group))
#                           
#                           if (max_group < 1000) {
#                             params$method <- "loess"
#                           } else {
#                             params$method <- "gam"
#                             params$formula <- y ~ s(x, bs = "cs")
#                           }
#                         }
#                         if (identical(params$method, "gam")) {
#                           params$method <- mgcv::gam
#                         }
#                         
#                         params
#                       },
#                       
#                       compute_group = function(data, scales, method = "auto", formula = y~x,
#                                                se = TRUE, n = 80, span = 0.75, fullrange = FALSE,
#                                                xseq = NULL, level = 0.95, method.args = list(),
#                                                na.rm = FALSE, xpos=NULL, ypos=NULL) {
#                         if (length(unique(data$x)) < 2) {
#                           # Not enough data to perform fit
#                           return(data.frame())
#                         }
#                         
#                         if (is.null(data$weight)) data$weight <- 1
#                         
#                         if (is.null(xseq)) {
#                           if (is.integer(data$x)) {
#                             if (fullrange) {
#                               xseq <- scales$x$dimension()
#                             } else {
#                               xseq <- sort(unique(data$x))
#                             }
#                           } else {
#                             if (fullrange) {
#                               range <- scales$x$dimension()
#                             } else {
#                               range <- range(data$x, na.rm = TRUE)
#                             }
#                             xseq <- seq(range[1], range[2], length.out = n)
#                           }
#                         }
#                         # Special case span because it's the most commonly used model argument
#                         if (identical(method, "loess")) {
#                           method.args$span <- span
#                         }
#                         
#                         if (is.character(method)) method <- match.fun(method)
#                         
#                         base.args <- list(quote(formula), data = quote(data), weights = quote(weight))
#                         model <- do.call(method, c(base.args, method.args))
#                         
#                         m = model
#                         eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
#                                          list(a = format(coef(m)[1], digits = 3), 
#                                               b = format(coef(m)[2], digits = 3), 
#                                               r2 = format(summary(m)$r.squared, digits = 3)))
#                         func_string = as.character(as.expression(eq))
#                         
#                         if(is.null(xpos)) xpos = min(data$x)*0.9
#                         if(is.null(ypos)) ypos = max(data$y)*0.9
#                         data.frame(x=xpos, y=ypos, label=func_string)
#                         
#                       },
#                       
#                       required_aes = c("x", "y")
# )

Visualize viability data

## Warning in melt(viability): The melt generic in data.table has been passed
## a data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(viability). In the next version, this warning will become an
## error.
## Using X as id variables

Data pre-processing - clean up dataframe and add important specifiers

# Add viability data to indel.data
indel.data <- merge(viability.data, indel.data) 
setnames(indel.data, old = "V2", new = "rn")

# Add previously estimated +1/-7 ratio data for correlation check-ups
indel.data <- merge(mean.ratio.data, indel.data, all = T)
all.indel.data <- indel.data

# Rename columns
setnames(indel.data, old = ".id", new = "condition")
setnames(indel.data, old = "rn", new = "barcode")

# Remove barcodes with abnormal counts from analysis
indel.data <- indel.data[indel.data$barcode != "ACCCCTAAAGGCGCTG" &
                           indel.data$barcode != "CTCTTAATCGCTGCC",]


# Set a threshold on minimun +1 and -7 read counts to exclude noisy data
low.indel.count <- indel.data[indel.data$read.count1.7 < 30,]
low.indel.count <- low.indel.count[low.indel.count$X0 > 100,]
low.indel.count <- low.indel.count[!low.indel.count$Drug %in% "PAO",]

indel.data <- indel.data[indel.data$read.count1.7 > 30,]

# Remove 0, NAs, Infs
indel.data <- indel.data[is.finite(indel.data$ratio),]
indel.data <- indel.data[indel.data$ratio != 0,]

# Add plate specifier
indel.data$plate.nr <- "1"
indel.data[grep("10um_rep1_plate2", indel.data$condition),]$plate.nr <- "2"
indel.data[grep("10um_rep2_plate1", indel.data$condition),]$plate.nr <- "3"
indel.data[grep("10um_rep2_plate2", indel.data$condition),]$plate.nr <- "4"
indel.data[grep("10um_rep3_plate1", indel.data$condition),]$plate.nr <- "5"
indel.data[grep("10um_rep3_plate2", indel.data$condition),]$plate.nr <- "6"
indel.data[grep("1um_rep1_plate1", indel.data$condition),]$plate.nr <- "7"
indel.data[grep("1um_rep1_plate2", indel.data$condition),]$plate.nr <- "8"
indel.data[grep("1um_rep2_plate1", indel.data$condition),]$plate.nr <- "9"
indel.data[grep("1um_rep2_plate2", indel.data$condition),]$plate.nr <- "10"
indel.data[grep("1um_rep3_plate1", indel.data$condition),]$plate.nr <- "11"
indel.data[grep("1um_rep3_plate2", indel.data$condition),]$plate.nr <- "12"
indel.data[grep("100nm_rep1_plate1", indel.data$condition),]$plate.nr <- "13"
indel.data[grep("100nm_rep1_plate2", indel.data$condition),]$plate.nr <- "14"
indel.data[grep("100nm_rep2_plate1", indel.data$condition),]$plate.nr <- "15"
indel.data[grep("100nm_rep2_plate2", indel.data$condition),]$plate.nr <- "16"
indel.data[grep("100nm_rep3_plate1", indel.data$condition),]$plate.nr <- "17"
indel.data[grep("100nm_rep3_plate2", indel.data$condition),]$plate.nr <- "18"

## Compute some useful entities for data analysis
# Take log values of ratio
indel.data$logratio <- ave(indel.data$ratio, FUN = function(x) log2(x))

# Compute relative viability
indel.data$viability <- ave(indel.data$viability, indel.data$plate.nr, FUN = function(x) x/max(x))

# We want to exclude reads from cells that have reduced viability (set cut-off at 45% reduced viability)
indel.data <- indel.data[indel.data$viability > 0.45,]


## Add some more classifiers
indel.data$classifier <- gsub("_rep[0-9]", "\\1", indel.data$condition)
indel.data$uniqueID <- paste(indel.data$barcode, indel.data$classifier)

# Optional: Exclude data completely (remove the remaining datapoint) if 2 out of 3 replicates were already excluded untill now
## indel.data <- indel.data[indel.data$uniqueID %in% names(which(table(indel.data$uniqueID) > 1)), ]

Analysis

Data analysis

## Normalize the data
# Plate normalization: substract log2(DMSO.mean)ratio from log2(drug)ratio per plate
for (i in 1:18) {
  for (j in unique(indel.data$barcode)) {
  dmso.mean <- mean(indel.data$logratio[indel.data$Drug == 
                        "DMSO" & indel.data$plate.nr == 
                        i & indel.data$barcode == j])
  indel.data$logratio.platenorm[indel.data$plate.nr == i & indel.data$barcode == j] <- 
    indel.data$logratio[indel.data$plate.nr == i & indel.data$barcode == j] -
    dmso.mean
  }
}

# Plate normalization for efficiency
for (i in 1:18) {
  for (j in unique(indel.data$barcode)) {
  dmso.eff.mean <- mean(indel.data$efficiency[indel.data$Drug == 
                        "DMSO" & indel.data$plate.nr == 
                        i & indel.data$barcode == j])
  indel.data$efficiency.platenorm[indel.data$plate.nr == i & indel.data$barcode == j] <-
    indel.data$efficiency[indel.data$plate.nr == i & indel.data$barcode == j] -
    dmso.eff.mean
  }
}

## Perform t-tests - save in different file

# t-test for target groups
indel.data.aurora <- indel.data[indel.data$Target == "Aurora Kinase" |
                            indel.data$Drug == "DMSO" &
                            indel.data$conc == "1um",]
indel.data.aurora$logratio.drug <- ave(indel.data.aurora$logratio.platenorm, 
                                 indel.data.aurora$Target,
                                 indel.data.aurora$rep,
                                 FUN = function(x) mean(x))
indel.data.aurora <- indel.data.aurora %>% select(Target, rep, logratio.drug) %>% unique()
indel.data.aurora$mean <- ave(indel.data.aurora$logratio.drug, indel.data.aurora$Target, FUN = function(x) mean(x))
indel.data.aurora$sd <- ave(indel.data.aurora$logratio.drug, indel.data.aurora$Target, FUN = function(x) sd(x))

for (i in unique(indel.data.aurora$Target)) {
    x <- indel.data.aurora[indel.data.aurora$Target == "Negative Control",]
    y <- indel.data.aurora[indel.data.aurora$Target == "Aurora Kinase",]
    
    if (nrow(y)>1) {
    indel.data.aurora$p.value[indel.data.aurora$Target == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$p.value
    
    indel.data.aurora$statistic[indel.data.aurora$Target == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$statistic
  
    indel.data.aurora$conf.int1[indel.data.aurora$Target == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[1]
  
    indel.data.aurora$conf.int2[indel.data.aurora$Target == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[2]
    }
}


# Select relevant data
indel.data.aurora <- indel.data.aurora %>% select(Target, mean, sd, p.value) %>% unique()


# t-test for controls
indel.data.controls <- indel.data[indel.data$Drug == "DNA-PKi" |
                            indel.data$Drug == "DMSO" |
                            indel.data$Drug == "Mirin",]
indel.data.controls$logratio.drug <- ave(indel.data.controls$logratio.platenorm, 
                                 indel.data.controls$Drug,
                                 indel.data.controls$rep, FUN = function(x) mean(x))
indel.data.controls <- indel.data.controls %>% select(Drug, rep, logratio.drug) %>% unique()
indel.data.controls$mean <- ave(indel.data.controls$logratio.drug, indel.data.controls$Drug,  FUN = function(x) mean(x))
indel.data.controls$sd <- ave(indel.data.controls$logratio.drug, indel.data.controls$Drug,  FUN = function(x) sd(x))

for (i in unique(indel.data.controls$Drug)) {
    x <- indel.data.controls[indel.data.controls$Drug == "DMSO",]
    y <- indel.data.controls[indel.data.controls$Drug == i,]
    
    if (nrow(y)>1) {
    indel.data.controls$p.value[indel.data.controls$Drug == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$p.value
    
    indel.data.controls$statistic[indel.data.controls$Drug == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$statistic
  
    indel.data.controls$conf.int1[indel.data.controls$Drug == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[1]
  
    indel.data.controls$conf.int2[indel.data.controls$Drug == i] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[2]
    }
}


# Select relevant data
indel.data.controls <- indel.data.controls %>% select(Drug, mean, sd, p.value) %>% unique()


# t-test for individual drugs
indel.data.drugs <- indel.data
indel.data.drugs$logratio.drug <- ave(indel.data.drugs$logratio.platenorm, 
                                 indel.data.drugs$Drug, indel.data$conc,
                                 indel.data.drugs$rep, FUN = function(x) mean(x))
indel.data.drugs <- indel.data.drugs %>% select(Drug, rep, conc, logratio.drug) %>% unique()
indel.data.drugs$mean <- ave(indel.data.drugs$logratio.drug, indel.data.drugs$Drug, indel.data.drugs$conc, FUN = function(x) mean(x))
indel.data.drugs$sd <- ave(indel.data.drugs$logratio.drug, indel.data.drugs$Drug, indel.data.drugs$conc, FUN = function(x) sd(x))

for (j in unique(indel.data.drugs$conc)) {
for (i in unique(indel.data.drugs$Drug)) {
    x <- indel.data.drugs[indel.data.drugs$Drug == "DMSO" & indel.data.drugs$conc == j,]
    y <- indel.data.drugs[indel.data.drugs$Drug == i & indel.data.drugs$conc == j,]
    
    if (nrow(y)>1) {
    indel.data.drugs$p.value[indel.data.drugs$Drug == i & indel.data.drugs$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$p.value
    
    indel.data.drugs$statistic[indel.data.drugs$Drug == i & indel.data.drugs$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$statistic
  
    indel.data.drugs$conf.int1[indel.data.drugs$Drug == i & indel.data.drugs$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[1]
  
    indel.data.drugs$conf.int2[indel.data.drugs$Drug == i & indel.data.drugs$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[2]
    }
}
}

# Select relevant data
indel.data.drugs <- indel.data.drugs %>% select(Drug, conc, mean, sd, p.value) %>% unique()

# t-test for each drug, treating integrations as replicates
indel.data.drugs.int <- indel.data
indel.data.drugs.int$logratio.drug <- ave(indel.data.drugs.int$logratio.platenorm, 
                                 indel.data.drugs.int$Drug, indel.data$conc,
                                 indel.data.drugs.int$rep, indel.data.drugs.int$barcode,
                                 FUN = function(x) mean(x))
indel.data.drugs.int <- indel.data.drugs.int %>% select(Drug, rep, conc, classifier, logratio.drug) %>%
  unique()
indel.data.drugs.int$mean <- ave(indel.data.drugs.int$logratio.drug, indel.data.drugs.int$Drug, indel.data.drugs.int$conc, FUN = function(x) mean(x))
indel.data.drugs.int$sd <- ave(indel.data.drugs.int$logratio.drug, indel.data.drugs.int$Drug, indel.data.drugs.int$conc, FUN = function(x) sd(x))

for (j in unique(indel.data.drugs.int$conc)) {
for (i in unique(indel.data.drugs.int$Drug)) {
    x <- indel.data.drugs.int[indel.data.drugs.int$Drug == "DMSO" & indel.data.drugs.int$conc == j,]
    y <- indel.data.drugs.int[indel.data.drugs.int$Drug == i & indel.data.drugs.int$conc == j,]
    
    if (nrow(y)>1) {
    indel.data.drugs.int$p.value[indel.data.drugs.int$Drug == i & indel.data.drugs.int$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$p.value
    
    indel.data.drugs.int$statistic[indel.data.drugs.int$Drug == i & indel.data.drugs.int$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$statistic
  
    indel.data.drugs.int$conf.int1[indel.data.drugs.int$Drug == i & indel.data.drugs.int$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[1]
  
    indel.data.drugs.int$conf.int2[indel.data.drugs.int$Drug == i & indel.data.drugs.int$conc == j] <- 
      t.test(x$logratio.drug, y$logratio.drug)$conf.int[2]
    }
}
}

# Adjust p-values for multiple testing and select relevant data
indel.data.drugs.int$p.value.adjust <- p.adjust(indel.data.drugs.int$p.value)
indel.data.drugs.int <- indel.data.drugs.int %>% select(Drug, conc, mean, sd, p.value) %>% unique()



# Run t-test within each well for each barcode
for (i in unique(indel.data$barcode)) {
  
    for (j in unique(indel.data$classifier)) {
    x <- indel.data[indel.data$Drug == "DMSO" & indel.data$barcode == i,]
    y <- indel.data[indel.data$classifier == j & indel.data$barcode == i,]
    
  if (nrow(y)>1) {
    
    indel.data$p.value[indel.data$classifier == j & indel.data$barcode == i] <- 
      t.test(x$logratio.platenorm, y$logratio.platenorm)$p.value
    
    indel.data$statistic[indel.data$classifier == j & indel.data$barcode == i] <- 
      t.test(x$logratio.platenorm, y$logratio.platenorm)$statistic
  
    indel.data$conf.int1[indel.data$classifier == j & indel.data$barcode == i] <- 
      t.test(x$logratio.platenorm, y$logratio.platenorm)$conf.int[1]
  
    indel.data$conf.int2[indel.data$classifier == j & indel.data$barcode == i] <- 
      t.test(x$logratio.platenorm, y$logratio.platenorm)$conf.int[2]
  }
    else {
      indel.data$p.value[indel.data$classifier == j & indel.data$barcode == i] <-
      t.test(x$logratio.platenorm, mu = y$logratio.platenorm)$p.value

      indel.data$statistic[indel.data$classifier == j & indel.data$barcode == i] <-
      t.test(x$logratio.platenorm, mu = y$logratio.platenorm)$statistic

      indel.data$conf.int1[indel.data$classifier == j & indel.data$barcode == i] <-
      t.test(x$logratio.platenorm, mu = y$logratio.platenorm)$conf.int[1]

      indel.data$conf.int2[indel.data$classifier == j & indel.data$barcode == i] <-
      t.test(x$logratio.platenorm, mu = y$logratio.platenorm)$conf.int[2]

      indel.data$conf.int <- abs(indel.data$conf.int2 - indel.data$conf.int1)
    }
    }
}



# Adjust p-values for multiple testing
indel.data$p.value.adjust <- p.adjust(indel.data$p.value)

# Look at specific drugs
indel.data2 <- indel.data
indel.data2$logratio.mean <- ave(indel.data$logratio.platenorm,
                                 indel.data$barcode,
                                 indel.data$Drug,
                                 indel.data$conc,
                                 FUN = function(x) mean(x))
indel.data2$logratio.sd <- ave(indel.data$logratio.platenorm,
                                 indel.data$barcode,
                                 indel.data$Drug,
                                 indel.data$conc,
                                 FUN = function(x) sd(x))
indel.data2 <- indel.data2 %>% select(barcode, Drug, conc, logratio.platenorm, 
                                      p.value.adjust, logratio.mean, logratio.sd) %>% unique()


# Create outlier df based on t-statistic
t.outlier.df <- indel.data[indel.data$p.value <= 0.05,]

# Calculate the mean of the three replicates
indel.data$logratio.mean <- 
  ave(indel.data$logratio.platenorm, 
      indel.data$classifier, indel.data$barcode, FUN = function(x) mean(x))

indel.data$efficiency.mean <- 
  ave(indel.data$efficiency.platenorm, 
      indel.data$classifier, indel.data$barcode, FUN = function(x) mean(x))

Results

Visualization: checking quality of data

Visualization: correlation plots for quality check

Conclusion correlation plots

We can see that there is a very high correlation between all 3 replicates, meaning that we can use the data of all 3 replicates.

Visualizaton: Spotting outliers

##             Df Sum Sq Mean Sq F value Pr(>F)    
## integration 17 10.403  0.6120   15.84 <2e-16 ***
## Residuals   72  2.781  0.0386                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Conclusion

The data looks very nice, we can already spot the first outliers. Now we need to generate more sophisticated plots (like heatmaps) and integrate chromatin data to select hits.

Session Info

## [1] "Run time:  2.563883 mins"
## [1] "/DATA/usr/m.trauernicht/projects/EpiScreen/code/epigenetic-screening-on-trip-clone"
## [1] "Fri May  1 09:49:13 2020"
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.2.5                magrittr_1.5               
##  [3] Laurae_0.0.0.9001           platetools_0.1.2           
##  [5] gghighlight_0.1.0           DESeq2_1.24.0              
##  [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
##  [9] BiocParallel_1.18.1         matrixStats_0.55.0         
## [11] Biobase_2.44.0              GenomicRanges_1.36.1       
## [13] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [15] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [17] GGally_1.5.0                ggrepel_0.8.1              
## [19] data.table_1.12.8           ggbeeswarm_0.6.0           
## [21] ggplot2_3.2.1               outliers_0.14              
## [23] dplyr_0.8.5                
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
##  [4] tools_3.6.3            backports_1.1.5        R6_2.4.1              
##  [7] rpart_4.1-15           vipor_0.4.5            Hmisc_4.3-0           
## [10] DBI_1.1.0              lazyeval_0.2.2         colorspace_1.4-1      
## [13] nnet_7.3-12            withr_2.1.2            tidyselect_0.2.5      
## [16] gridExtra_2.3          bit_1.1-15             compiler_3.6.3        
## [19] htmlTable_1.13.3       labeling_0.3           scales_1.1.0          
## [22] checkmate_1.9.4        genefilter_1.66.0      stringr_1.4.0         
## [25] digest_0.6.23          foreign_0.8-74         rmarkdown_2.0         
## [28] XVector_0.24.0         base64enc_0.1-3        jpeg_0.1-8.1          
## [31] pkgconfig_2.0.3        htmltools_0.4.0        htmlwidgets_1.5.1     
## [34] rlang_0.4.5            rstudioapi_0.10        RSQLite_2.2.0         
## [37] farver_2.0.1           acepack_1.4.1          RCurl_1.95-4.12       
## [40] GenomeInfoDbData_1.2.1 Formula_1.2-3          Matrix_1.2-18         
## [43] Rcpp_1.0.3             munsell_0.5.0          lifecycle_0.2.0       
## [46] stringi_1.4.6          yaml_2.2.0             zlibbioc_1.30.0       
## [49] plyr_1.8.6             grid_3.6.3             blob_1.2.0            
## [52] crayon_1.3.4           lattice_0.20-38        splines_3.6.3         
## [55] annotate_1.62.0        locfit_1.5-9.1         knitr_1.28            
## [58] pillar_1.4.3           ggsignif_0.6.0         reshape2_1.4.4        
## [61] geneplotter_1.62.0     XML_3.98-1.20          glue_1.3.1            
## [64] evaluate_0.14          latticeExtra_0.6-29    png_0.1-7             
## [67] vctrs_0.2.4            gtable_0.3.0           purrr_0.3.3           
## [70] reshape_0.8.8          assertthat_0.2.1       xfun_0.12             
## [73] xtable_1.8-4           survival_3.1-8         tibble_3.0.1          
## [76] AnnotationDbi_1.46.1   beeswarm_0.2.3         memoise_1.1.0         
## [79] cluster_2.1.0          ellipsis_0.3.0